EDA Practice
EDA¶
PACKAGES¶
In [1]:
import pandas as pd
import numpy as np
import pandas_profiling as pp
LOAD DATA¶
In [68]:
# Load the insurance dataset straight from its public URL into `df`.
INSURANCE_CSV_URL = "http://www.ishelp.info/data/insurance.csv"
df = pd.read_csv(INSURANCE_CSV_URL)
SUMMARY_OF_DATA¶
In [70]:
from autoviz.AutoViz_Class import AutoViz_Class

# AutoViz is handed the CSV URL directly (not the already-loaded `df`),
# so this cell fetches the file a second time.
insurance_csv_url = 'http://www.ishelp.info/data/insurance.csv'
AV = AutoViz_Class()
# NOTE(review): `df_av` is presumably the frame AutoViz loaded - confirm against the AutoViz docs.
df_av = AV.AutoViz(insurance_csv_url)
In [71]:
# Build the pandas-profiling report for `df`; as the last expression of the
# cell, the report object is rendered as the cell output.
pp.ProfileReport(df)
Out[71]:
In [ ]:
# Preview the first rows of the insurance frame (head() defaults to 5 rows).
df.head()
In [5]:
# Summary statistics; by default describe() reports on the numeric columns only.
df.describe()
Out[5]:
In [6]:
# (number of rows, number of columns)
df.shape
Out[6]:
In [45]:
# Column labels of the insurance frame.
df.columns
Out[45]:
In [46]:
# Non-null value count for every column.
# DataFrame.count() covers all columns in one call, so no column has to be
# listed (and kept in sync with the data) by hand.
df.count()
unique_values¶
In [47]:
# Number of distinct values in every column.
# DataFrame.nunique() replaces the seven hand-written per-column prints and
# automatically follows any change to the set of columns.
df.nunique()
typing¶
In [48]:
# Data type of every column.
# `df.dtypes` already holds this as a Series indexed by column name, which
# also removes the inconsistent hand-typed labels ("age :" vs "sex:").
df.dtypes
isnull¶
In [50]:
# Number of missing (null) values in every column.
# isnull() marks each missing cell as True; summing column-wise counts them,
# covering all columns without listing them one by one.
df.isnull().sum()
In [55]:
# Missing-value count, one line per column.
# The previous loop never used `col`, so it printed the complete
# df.isnull().sum() table once for every column in the frame.
for col in df:
    print(f'{col}: {df[col].isnull().sum()}')
In [59]:
# Peek at the underlying NumPy array.
# pandas recommends to_numpy() over the `.values` accessor, and slicing keeps
# the displayed output to the first few rows instead of the whole array.
df.to_numpy()[:5]
Out[59]:
LABEL_ENCODER¶
In [61]:
# Leftover reference snippet: manual label encoding by replacing category
# strings with integer codes. `replaced_base` / `dropped_base` are not defined
# in the visible part of this notebook, so it is kept commented out.
#replaced_base['Ward']=dropped_base['Ward'].replace(['BMT1', 'BMT2', "BMT3","BMT4"],
#[0, 1, 2 , 3], inplace=False)
In [76]:
from sklearn import preprocessing

# Replace the "sex" column with integer class codes produced by a LabelEncoder.
# The fitted encoder stays available as `le` for later cells.
le = preprocessing.LabelEncoder()
sex_codes = le.fit(df["sex"]).transform(df["sex"])
df["sex"] = sex_codes
In [77]:
# Display the frame to inspect the result of encoding the "sex" column.
df
Out[77]:
In [78]:
# Print the name of every column whose dtype is still `object`.
object_columns = [name for name, dtype in df.dtypes.items() if dtype == 'object']
for name in object_columns:
    print(name)
    # Encoding step left disabled; `base` is not defined in the visible notebook.
    #base[col] = le.fit_transform(base[col])
In [79]:
# Keep only the columns that contain at least 200 non-null values.
# `how` and `thresh` are mutually exclusive in DataFrame.dropna: pandas >= 1.5
# raises TypeError when both are passed, and older versions ignored `how`
# whenever `thresh` was given. Only `thresh` is therefore passed; the redundant
# defaults `subset=None` and `inplace=False` are dropped as well.
dropped_df = df.dropna(axis=1, thresh=200)
dropped_df
Out[79]:
Importing Needed packages¶
In [8]:
import matplotlib.pyplot as plt
import pandas as pd
import pylab as pl
import numpy as np
%matplotlib inline
Downloading Data¶
In [9]:
# Shell escape: download the fuel-consumption CSV and save it in the working
# directory as FuelConsumption.csv (requires `wget` to be installed).
!wget -O FuelConsumption.csv https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ML0101EN-SkillsNetwork/labs/Module%202/data/FuelConsumptionCo2.csv
Reading the data in¶
In [4]:
# Load the downloaded CSV.
# NOTE: this rebinds `df`, replacing the insurance frame loaded earlier in the notebook.
df = pd.read_csv("FuelConsumption.csv")
In [5]:
# Take a look at the first rows of the dataset.
df.head()
Out[5]:
Data Exploration¶
Let's first do a descriptive exploration of our data.
In [10]:
# Summarize the data: descriptive statistics for the numeric columns.
df.describe()
Out[10]:
Let's select some features to explore more.
In [12]:
# Narrow the frame to the features explored below and preview nine rows.
selected_features = ['ENGINESIZE', 'CYLINDERS', 'FUELCONSUMPTION_COMB', 'CO2EMISSIONS']
cdf = df[selected_features]
cdf.head(9)
Out[12]:
We can plot each of these features:
In [13]:
# Draw one histogram per selected feature.
histogram_columns = ['CYLINDERS', 'ENGINESIZE', 'CO2EMISSIONS', 'FUELCONSUMPTION_COMB']
viz = cdf[histogram_columns]
viz.hist()
plt.show()
Now, let's plot each of these features against the Emission, to see how linear their relationship is:
In [15]:
# Scatter combined fuel consumption against CO2 emissions to judge how
# linear the relationship between the two is.
plt.scatter(
    x=cdf["FUELCONSUMPTION_COMB"],
    y=cdf["CO2EMISSIONS"],
    color='blue',
)
plt.xlabel("FUELCONSUMPTION_COMB")
plt.ylabel("Emission")
plt.show()
In [ ]:
Comments
Comments powered by Disqus